将新能源Excel文件转换为新能源CSV

In [1]:
import pandas as pd
import openpyxl
import re

xlsx = '新能源行业.xlsx'

df = pd.read_excel(xlsx)

exf = openpyxl.load_workbook(xlsx)
sheet = exf.active
C2 = sheet['C2']
C = sheet['C']

links = [c.value for c in C]

link_1=links[1:-1]

link_2=''.join(link_1)

p= re.compile('"(.*?),"(.*?)"')
list_of_tuple=p.findall(link_2)

df2=pd.DataFrame({'link':[t[0]for t in list_of_tuple],
                  'f_name':[t[1]for t in list_of_tuple]})
df2.to_csv('新能源行业.csv')

筛选出十个企业最新年报链接

In [3]:
import re
import pandas as pd
import os


df=pd.read_csv('新能源行业.csv')
p=re.compile('(?<=\d{4})(年报)|(年年报)')        
f_names=[p.sub(('年年度报'), f)for f in df.f_name]
df['f_name']=f_names;del p,f_names

def filter_links(words,df,include=True):
    Is=[]
    for word in words:
        if include:
            Is.append([word in f for f in df.f_name])
        else:
            Is.append([word not in f for f in df.f_name])
    index=[]
    for r in range(len(df)):
        flag=not include
        for c in range(len(words)):
            if include:
                flag = flag or Is[c][r]
            else:
                flag = flag and Is[c][r]
        index.append(flag)
    df2 = df[index]
    return(df2)

df_all = filter_links(['摘要','询问函','社会责任','审计','财务','风险','债券'],df,include=False)
df_orig = filter_links(['(','('], df_all,include=False)
df_updt = filter_links(['(','('], df_all,include=True)
df_updt = filter_links(['取消'],df_updt,include=False)
In [5]:
def sub_with_update(df_updt,df_orig):
    df_newest = df_orig.copy()
    index_orig = []
    index_updt = []
    for i,f in enumerate(df_orig.f_name):
        for j,fn in enumerate(df_updt.f_name):
            if f in fn:  
                index_orig.append(i)
                index_updt.append(j)        
    #return((index_orig,index_updt))
    for n in range(len(index_orig)):
        i = index_orig[n]
        j = index_updt[n]
        df_newest.iloc[i,-2] = df_updt.iloc[j,-2]
        #df_newest.iloc[i,-1] = df_updt.iloc[j,-1]
    return(df_newest)

df_newest = sub_with_update(df_updt,df_orig)
#index_orig,index_updt = sub_with_update(df_updt, df_orig)
            
df_newest.sort_values(by=['f_name'],inplace=True,ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]

counts = df_newest['公司简称'].value_counts()
In [6]:
ten_company = []
for cn in counts.index[:10]:
    ten_company.append(filter_links([cn], df_newest))
    
if not os.path.exists('10companies'):
    os.makedirs('10companies')
    
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)  
    
ten_csv = os.listdir('10companies')

将筛选出的十个企业的最新年报链接下载

In [1]:
import fitz
import os
os.chdir(r"C:\Users\dell\Desktop\limengfei\limengfei\final-project\10companies")
filenames = os.listdir() #把代码所在文件夹的所有文件和文件名查找出来
finalfix = '年年度报告'

pdf_list = [f for f in filenames if f.endswith('年年度报告.pdf')]
years = [f[-13:-9] for f in pdf_list]
pdf_list
Out[1]:
['2020-04-25-002639.SZ-雪人股份:2019年年度报告.pdf',
 '2020-04-25-600218.SH-600218全柴动力2019年年度报告.pdf',
 '2020-04-27-300626.SZ-华瑞股份:2019年年度报告.pdf',
 '2020-04-27-600884.SH-600884杉杉股份2019年年度报告.pdf',
 '2020-04-29-002249.SZ-大洋电机:2019年年度报告.pdf',
 '2020-04-30-002158.SZ-汉钟精机:2019年年度报告.pdf',
 '2020-04-30-300540.SZ-深冷股份:2019年年度报告.pdf',
 '2021-03-26-600218.SH-600218全柴动力2020年年度报告.pdf',
 '2021-04-03-600884.SH-600884杉杉股份2020年年度报告.pdf',
 '2021-04-15-002340.SZ-格林美:2020年年度报告.pdf',
 '2021-04-22-002074.SZ-国轩高科:2020年年度报告.pdf',
 '2021-04-24-002639.SZ-雪人股份:2020年年度报告.pdf',
 '2021-04-26-300626.SZ-华瑞股份:2020年年度报告.pdf',
 '2021-04-27-300540.SZ-深冷股份:2020年年度报告.pdf',
 '2021-04-30-002158.SZ-汉钟精机:2020年年度报告.pdf']

搭建环境并读取年报中的营业收入内容

In [5]:
import fitz
import os
import re
import pandas as pd
os.chdir(r"C:\Users\dell\Desktop\limengfei\limengfei\final-project\10companies")
filenames = os.listdir() #把代码所在文件夹的所有文件和文件名查找出来
finalfix = '年年度报告'
In [17]:
pdf_list = [f for f in filenames if f.endswith('年年度报告.pdf')]
years = [f[-13:-9] for f in pdf_list]
pdf_list
Out[17]:
['2020-04-25-002639.SZ-雪人股份:2019年年度报告.pdf',
 '2020-04-25-600218.SH-600218全柴动力2019年年度报告.pdf',
 '2020-04-27-300626.SZ-华瑞股份:2019年年度报告.pdf',
 '2020-04-27-600884.SH-600884杉杉股份2019年年度报告.pdf',
 '2020-04-29-002249.SZ-大洋电机:2019年年度报告.pdf',
 '2020-04-30-002158.SZ-汉钟精机:2019年年度报告.pdf',
 '2020-04-30-300540.SZ-深冷股份:2019年年度报告.pdf',
 '2021-03-26-600218.SH-600218全柴动力2020年年度报告.pdf',
 '2021-04-03-600884.SH-600884杉杉股份2020年年度报告.pdf',
 '2021-04-15-002340.SZ-格林美:2020年年度报告.pdf',
 '2021-04-22-002074.SZ-国轩高科:2020年年度报告.pdf',
 '2021-04-24-002639.SZ-雪人股份:2020年年度报告.pdf',
 '2021-04-26-300626.SZ-华瑞股份:2020年年度报告.pdf',
 '2021-04-27-300540.SZ-深冷股份:2020年年度报告.pdf',
 '2021-04-30-002158.SZ-汉钟精机:2020年年度报告.pdf']
In [7]:
def extract_data(pdf):
    idx = pdf.find(':')  
    company_name = pdf[0:idx]
    year = pdf[idx+1:idx+5]
    #
    doc = fitz.open(pdf)  
    text = [page.get_text() for page in doc]  
    text = ''.join(text)  
    #
    p_s = re.compile(r'(?<=\n)\w{1,2}、.*?会计数据和财务指标\s*?(?=\n)')  
    section_match = p_s.search(text) 
    s_idx = section_match.start() 
    #
    p = re.compile('营业收入(.*?)归属于',re.DOTALL) 
    data_line = p.search(text[s_idx:]).group()
    data_line = data_line.replace('\n', '')
    p_digit = re.compile(r'(-)?\d[,0-9]*?\.\d{1,2}') 
    revenue = p_digit.search(data_line).group() 
    revenue = revenue.replace(',','')
    return((company_name,year,revenue))
In [9]:
companies, years, revenues = [],[],[]
for pdf in pdf_list:
    company, year, revenue = extract_data(pdf)
    companies.append(company)
    years.append(year)
    revenues.append(revenue)
    
df = pd.DataFrame({'company': companies,
                  'year': years,
                  'revenue': revenues})
df.sort_values('company',ignore_index=True, inplace=True)
df 
Out[9]:
company year revenue
0 2020-04-25-002639.SZ-雪人股份 2019 1513568678.62
1 2020-04-25-600218.SH-600218全柴动力2019年年度报告.pd 2020 4044447405.35
2 2020-04-27-300626.SZ-华瑞股份 2019 753352460.75
3 2020-04-27-600884.SH-600884杉杉股份2019年年度报告.pd 2020 8679910968.83
4 2020-04-29-002249.SZ-大洋电机 2019 8145952951.58
5 2020-04-30-002158.SZ-汉钟精机 2019 1806971546.97
6 2020-04-30-300540.SZ-深冷股份 2019 435349250.51
7 2021-03-26-600218.SH-600218全柴动力2020年年度报告.pd 2021 4455989703.12
8 2021-04-03-600884.SH-600884杉杉股份2020年年度报告.pd 2021 8215896691.46
9 2021-04-15-002340.SZ-格林美 2020 12466276299.40
10 2021-04-22-002074.SZ-国轩高科 2020 6724233230.56
11 2021-04-24-002639.SZ-雪人股份 2020 1458378294.45
12 2021-04-26-300626.SZ-华瑞股份 2020 729359322.62
13 2021-04-27-300540.SZ-深冷股份 2020 518723551.37
14 2021-04-30-002158.SZ-汉钟精机 2020 2272208133.84

定义函数抽取年报数据绘图

雪人股份

In [20]:
x = [2019,2020]
y = [1513568678.62,1458378294.45]
plt.title('revenue changing during 2019-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

全柴动力

In [21]:
x = [2019,2020]
y = [4044447405.35,4455989703.12]
plt.title('revenue changing during 2019-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

华瑞股份

In [22]:
x = [2019,2020]
y = [753352460.75,729359322.62]
plt.title('revenue changing during 2019-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

杉杉股份

In [23]:
x = [2019,2020]
y = [8679910968.83,8215896691.46]
plt.title('revenue changing during 2019-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

汉钟精机

In [25]:
x = [2019,2020]
y = [1806971546.97,2272208133.84]
plt.title('revenue changing during 2019-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

深冷股份

In [26]:
x = [2019,2020]
y = [435349250.51,518723551.37]
plt.title('revenue changing during 2019-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()

格林美

In [27]:
x = [2019,2020]
y = [8145952951.58,12466276299.40]
plt.title('revenue changing during 2019-2020') 
plt.xlabel("year") 
plt.ylabel("revenue") 
plt.plot(x,y)
plt.show()